## Rows: 342,867
## Columns: 30
## $ report_dat <chr> "5/8/2012 8:38:00 PM", "11/24/2009 10:23:00 PM...
## $ month <dbl> 5, 11, 7, 4, 11, 2, 2, 9, 1, 10, 1, 10, 1, 6, ...
## $ day <dbl> 8, 24, 15, 26, 29, 24, 27, 20, 5, 17, 9, 22, 3...
## $ year <dbl> 2012, 2009, 2014, 2011, 2011, 2017, 2017, 2015...
## $ hour <dbl> 20, 22, 21, 11, 16, 2, 14, 21, 11, 4, 12, 3, 1...
## $ minute <dbl> 38, 23, 5, 30, 10, 38, 39, 30, 55, 53, 40, 4, ...
## $ second <dbl> 0, 0, 0, 0, 0, 47, 42, 14, 0, 14, 21, 28, 0, 0...
## $ anc <chr> "7C", "6C", "6E", "7C", "6C", "2C", "3G", "1B"...
## $ block <chr> "5400 - 5499 BLOCK OF DIX STREET NE", "700 - 7...
## $ block_group <chr> "007808 1", "008302 1", "004802 1", "007808 3"...
## $ crimetype <chr> "Non-Violent", "Non-Violent", "Non-Violent", "...
## $ end_date <chr> "5/8/2012 9:00:00 PM", "11/24/2009 9:00:00 PM"...
## $ ew <chr> "East", "East", "East", "East", "East", "East"...
## $ method <chr> "OTHERS", "OTHERS", "OTHERS", "OTHERS", "OTHER...
## $ neighborhood_cluster <chr> "Cluster 31", "Cluster 25", "Cluster 8", "Clus...
## $ ns <chr> "North", "North", "North", "North", "North", "...
## $ offense <chr> "BURGLARY", "THEFT F/AUTO", "THEFT/OTHER", "TH...
## $ quad <chr> "Northeast", "Northeast", "Northeast", "Northe...
## $ shift <chr> "EVENING", "EVENING", "EVENING", "DAY", "EVENI...
## $ start_date <chr> "5/8/2012 12:45:00 PM", "11/24/2009 5:00:00 PM...
## $ voting_precinct <chr> "Precinct 96", "Precinct 83", "Precinct 18", "...
## $ ccn <dbl> 12063047, 9168589, 14402225, 11056847, 1117511...
## $ census_tract <dbl> 7808, 8302, 4802, 7808, 10600, 5900, 1500, 430...
## $ district <dbl> 6, 1, 3, 6, 5, 1, 2, 3, 1, 2, 7, 1, 2, 3, 3, 4...
## $ psa <dbl> 608, 104, 308, 602, 506, 102, 201, 305, 108, 2...
## $ ward <dbl> 7, 6, 6, 7, 6, 2, 4, 1, 6, 3, 8, 2, 2, 2, 6, 1...
## $ x <dbl> 242612, 161081, 178473, 217163, 167233, 35440,...
## $ x1 <dbl> 242612, 161081, 178473, 217163, 167233, 35440,...
## $ xblock <dbl> -76.92233, -76.99557, -77.01756, -76.92188, -7...
## $ yblock <dbl> 38.89472, 38.90020, 38.90564, 38.89463, 38.905...
#remove unused cols
a = a %>% select(!c(minute, second, anc, block, block_group, end_date, ew, neighborhood_cluster, ns, start_date, voting_precinct, ccn, district, x, x1)) %>%
mutate(
report_dat = anytime::anydate(report_dat),
#start_date = anytime::anydate(start_date),
#end_date = anytime::anydate(end_date),
across(where(is.character), factor),
census_tract = factor(census_tract, levels = a$census_tract %>% unique %>% sort),
ward = factor(ward, levels = a$ward %>% unique %>% sort),
psa = factor(psa, levels = a$psa %>% unique %>% sort),
year = factor(year, levels = a$year %>% unique %>% sort),
month = factor(month, levels = a$month %>% unique %>% sort),
day = factor(day, levels = a$day %>% unique %>% sort),
hour = factor(hour, levels = a$hour %>% unique %>% sort)
) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
month, day, year, hour,
where(is.character),
where(is.factor),
where(is.numeric)
) %>% arrange(report_dat, crimetype, offense)
abak = a
#saveRDS(abak, 'cleaned_data_10yrs.RDS')
#a = abak## Rows: 10,594
## Columns: 12
## $ month <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ day <fct> 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, ...
## $ year <fct> 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, 2012, ...
## $ hour <fct> 3, 13, 8, 21, 16, 20, 20, 7, 11, 3, 8, 13, 8, 21, 17, ...
## $ census_tract <fct> 7809, 9204, 10600, 7603, 3301, 10200, 7803, 9810, 1090...
## $ crimetype <fct> Non-Violent, Non-Violent, Non-Violent, Non-Violent, No...
## $ method <fct> OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS, OTHERS...
## $ offense <fct> BURGLARY, MOTOR VEHICLE THEFT, THEFT F/AUTO, MOTOR VEH...
## $ psa <fct> 608, 502, 501, 606, 501, 105, 602, 708, 708, 403, 107,...
## $ quad <fct> Northeast, Northeast, Northeast, Southeast, Northeast,...
## $ shift <fct> MIDNIGHT, DAY, DAY, EVENING, EVENING, EVENING, EVENING...
## $ ward <fct> 7, 5, 6, 7, 5, 6, 7, 8, 8, 4, 6, 8, 5, 1, 1, 6, 6, 1, ...
#defining custom color palette
jpal = grDevices::colorRampPalette(brewer.pal(8,'Dark2'))(25)
a %>% select(where(is.factor)) %>% map_df(n_unique)## $month
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12
##
## $day
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
## [26] 26 27 28 29 30 31
## 31 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 ... 31
##
## $year
## [1] 2012 2013 2014 2015 2016 2017
## Levels: 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017
##
## $hour
## [1] 3 13 8 21 16 20 7 11 17 14 9 10 4 18 12 15 2 0 19 6 22 1 23 5
## Levels: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
##
## $census_tract
## [1] 7809 9204 10600 7603 3301 10200 7803 9810 10900 2002 8200 7504
## [13] 8803 2802 3000 8410 3200 5201 2702 9604 9505 9504 11100 9102
## [25] 4002 9700 4802 6600 4400 11000 4600 9803 502 600 8701 9507
## [37] 6202 3700 5800 2701 7502 5002 9503 9903 9811 2202 10400 4801
## [49] 4100 8804 8904 9603 8903 9203 3100 1702 5600 9509 10100 4902
## [61] 7304 1001 7709 202 8100 8402 5500 1901 2900 8302 9601 6801
## [73] 2101 9400 9301 7409 7708 4702 3400 6400 7707 400 5001 7401
## [85] 4202 10700 2502 6500 1600 2102 4300 7403 7200 9602 100 7404
## [97] 3800 3600 9802 8001 9904 5900 7703 2600 3302 8802 7100 9901
## [109] 2801 300 501 7901 7406 2501 7503 8301 1100 9907 10300 7407
## [121] 1002 4901 9807 7605 8002 9508 9501 7408 5301 1402 4001 901
## [133] 3500 8702 7000 2201 10800 2400 7604 1804 10500 9000 9302 1302
## [145] 7808 1803 9905 1200 7601 9804 2001 9201 7804 4201 9906 9902
## [157] 9801 4701 7806 702 3900 802 701 7807 6802 6900 2302 6804
## [169] 2301 7903 201 1401 801 1301 6700 1902 7301 1500 902
## 179 Levels: 100 201 202 300 400 501 502 600 701 702 801 802 901 902 ... 11100
##
## $crimetype
## [1] Non-Violent Violent
## Levels: Non-Violent Violent
##
## $method
## [1] OTHERS GUN KNIFE
## Levels: GUN KNIFE OTHERS
##
## $offense
## [1] BURGLARY MOTOR VEHICLE THEFT
## [3] THEFT F/AUTO THEFT/OTHER
## [5] ASSAULT W/DANGEROUS WEAPON ROBBERY
## [7] SEX ABUSE ARSON
## [9] HOMICIDE
## 9 Levels: ARSON ASSAULT W/DANGEROUS WEAPON BURGLARY ... THEFT/OTHER
##
## $psa
## [1] 608 502 501 606 105 602 708 403 107 701 506 302 104 409 307 408 603 406 504
## [20] 503 303 706 308 305 707 204 404 102 304 101 702 405 103 505 306 208 507 401
## [39] 207 705 202 605 206 402 601 108 704 703 301 203 106 604 201 407 607 205
## 56 Levels: 101 102 103 104 105 106 107 108 201 202 203 204 205 206 207 ... 708
##
## $quad
## [1] Northeast Southeast Northwest
## Levels: Northeast Northwest Southeast
##
## $shift
## [1] MIDNIGHT DAY EVENING
## Levels: DAY EVENING MIDNIGHT
##
## $ward
## [1] 7 5 6 8 4 1 2 3
## Levels: 1 2 3 4 5 6 7 8
a %>% filter(year != 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - Historic')## 2 columns ignored with more than 50 categories.
## census_tract: 179 categories
## psa: 56 categories
a %>% count(psa, sort = TRUE, name = 'count') %>%
head(10) %>%
mutate(psa = factor(psa)) %>%
mutate(psa = fct_reorder(psa, -count)) %>%
plot_ly(x = ~psa, y = ~count, color = ~psa, colors = jpal) %>%
add_bars() %>%
layout(
title = 'Police Service Areas with the Most Crime'
)Observations:
a %>% filter(year == 2017) %>% select(where(is.factor)) %>% DataExplorer::plot_bar(ncol = 1, nrow = 1, title = 'Total Offenses by Category - 2017')## 2 columns ignored with more than 50 categories.
## census_tract: 174 categories
## psa: 56 categories
ipal = grDevices::colorRampPalette(brewer.pal(12,'Paired'))(56)
a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~quad, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Quadrants')a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~ward, colors = ipal) %>% add_markers() %>% layout(title = 'WDC Wards')a %>% filter(year == 2017) %>% plot_ly(x = ~xblock, y = ~yblock, color = ~psa, colors = ipal) %>% add_markers() %>% layout(title = 'WDC PSAs') %>% hide_legend()Observations relative to Historic
Recommendations for the year 2018 based on 2017 findings
ggplotly(
a %>% filter(year != 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - Historic') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations:
ggplotly(
a %>% filter(year == 2017) %>%
count(ward, offense) %>% ggplot(aes(x = offense, y = n, fill = offense)) +
geom_col() +
coord_flip() +
labs(x = '', y = 'count', title = 'Total Offenses by Type/Ward - 2017') +
facet_wrap(~ward) +
theme(legend.position = 'none')
)Observations relative to Historic
Recommendations for the year 2018 based on 2017 findings
# create col for start of month (a 'month' col) used for grouping and graphing
a = a %>% mutate(
monthkey = lubridate::make_datetime(
as.numeric(as.character(year)),
as.numeric(as.character(month)),
1)
) %>% relocate(report_dat, monthkey, everything())#Total Offenses over time
a %>% group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
ungroup() %>%
plot_ly(x = ~monthkey, y =~total.offenses) %>%
add_lines(size = I(3)) %>% layout(
xaxis = list(title = ''),
yaxis = list(title = ''),
title = 'Total Offenses by Month'
)Observations
ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = offense)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, offense)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(offense = fct_reorder(offense, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = offense)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Offense Type', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses, fill = ward)) +
geom_area() +
labs(title = 'Total Offenses Percentage (#) by Month/Ward', x = '', y = '')
)ggplotly(
a %>% group_by(monthkey, ward)%>%
summarise(total.offenses = n()) %>%
mutate(total.offenses.pct = total.offenses/sum(total.offenses)) %>%
ungroup() %>%
mutate(ward = fct_reorder(ward, total.offenses, .fun = mean)) %>%
ggplot(aes(monthkey, total.offenses.pct, fill = ward)) +
geom_area() +
scale_y_continuous(labels = scales::percent) +
labs(title = 'Total Offenses Percentage (%) by Month/Ward', x = '', y = '')
) ## Rows: 10,565
## Columns: 2
## $ xblock <dbl> -76.92713, -76.99716, -77.00740, -76.94992, -77.01060, -77.0...
## $ yblock <dbl> 38.90078, 38.92093, 38.90645, 38.86128, 38.92023, 38.87786, ...
## == Use anomalize to improve yo
## Business Science offers a 1-hour course - Lab #18: Time Series Anomaly Detection!
## </> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
# anomalize(data, target, method = c("iqr", "gesd"), alpha = 0.05, max_anoms = 0.2, verbose = FALSE)
# alpha: Controls the width of the "normal" range. Lower values are more conservative while higher values are less prone to incorrectly classifying "normal" observations.
# max_anoms: The maximum percent of anomalies permitted to be identified.
a.anomalize = a %>%
group_by(monthkey) %>%
summarise(total.offenses = n()) %>%
time_decompose(total.offenses, method = 'twitter', merge = TRUE) %>%
anomalize(remainder, alpha = 0.30, method = 'gesd') %>%
time_recompose()## `summarise()` ungrouping output (override with `.groups` argument)
## Converting from tbl_df to tbl_time.
## Auto-index message: index = monthkey
## frequency = 12 months
## median_span = 35.5 months
ggplotly(
a.anomalize %>%
filter(monthkey < as.Date('2017-11-01')) %>%
plot_anomalies(
ncol = 1,
alpha_dots = 0.5,
alpha_circles = 0.5,
size_circles = 1.5,
time_recomposed = TRUE,
alpha_ribbon = 0.05
) + scale_y_continuous(labels = comma) +
labs(x = '', y = 'total.offenses', title = 'total.offenses')
)library(prophet)
#renaming cols to prophet's col conventions
a.agg = a %>%
#filter(crimetype == 'Violent') %>%
group_by(report_dat = round_date(report_dat,'month')) %>%
summarise(total.offenses = n()) %>%
select(ds = report_dat, y = total.offenses)
#creating model
a.agg.model = a.agg %>% prophet()
#using model make future period df
a.agg.future = a.agg.model %>% make_future_dataframe(
periods = 3, #next 2 months
freq = 'month')
#make forecasts df
a.agg.forecast = a.agg.model %>% predict(a.agg.future)
#plot forecast
a.agg.model %>% dyplot.prophet(a.agg.forecast)## Warning: `select_()` is deprecated as of dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.